import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# NOTE: the working directory is switched to a machine-specific folder so the
# CSV can be opened by its bare file name; adjust the path on other machines.
os.chdir("C:/Users/agusv/Desktop/Estudio/Tesis/Csv")
data = pd.read_csv("multivariate_hvo100.csv", parse_dates=['Date'])
data.set_index('Date', inplace=True)
hvo100_prices = data['HVO100 Price']

FEATURE_COLUMNS = ['Brent Spot Price', 'Inflation', 'USD/EUR']
TARGET_COLUMN = 'HVO100 Price'
TRAIN_START_YEAR = 2023
TEST_YEAR = 2024


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error of *y_pred* against *y_true*."""
    return np.sqrt(mean_squared_error(np.asarray(y_true), np.asarray(y_pred)))


def _mape(y_true, y_pred):
    """Return the mean absolute percentage error of *y_pred*, in percent.

    Both inputs are converted to plain arrays and compared positionally, so
    differing pandas indexes cannot silently misalign the two series.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100


# ---------------------------------------------------------------------------
# Chronological train / test split
# ---------------------------------------------------------------------------
# The training window must end before the test year starts; otherwise every
# model below would be fitted on the very observations it is evaluated on.
_years = data.index.year
train_data = data[(_years >= TRAIN_START_YEAR) & (_years < TEST_YEAR)]
test_data = data[_years == TEST_YEAR]
if train_data.empty or test_data.empty:
    raise ValueError(
        f"Need rows in [{TRAIN_START_YEAR}, {TEST_YEAR}) for training and in "
        f"{TEST_YEAR} for testing; got {len(train_data)} and {len(test_data)}."
    )

X_training = train_data[FEATURE_COLUMNS]
y_training = train_data[TARGET_COLUMN]
X_testing = test_data[FEATURE_COLUMNS]
y_testing = test_data[TARGET_COLUMN]
_n_test = len(y_testing)

# ---------------------------------------------------------------------------
# Model 1: plain linear regression on the exogenous variables
# ---------------------------------------------------------------------------
linear_model = LinearRegression().fit(X_training, y_training)
y_pred_linear = linear_model.predict(X_testing)
rmse_linear = _rmse(y_testing, y_pred_linear)
mape_linear = _mape(y_testing, y_pred_linear)

# ---------------------------------------------------------------------------
# Model 2: linear regression + XGBoost correction of its residuals
# ---------------------------------------------------------------------------
# Residuals are measured on the TRAINING period only, so the correction
# models never see the test targets.
residuals_linear = y_training - linear_model.predict(X_training)
# The 20 % hold-out is kept available for inspection; it is not used further
# in this script.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_training, residuals_linear, test_size=0.2, random_state=42
)
xgboost_model_linear = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgboost_model_linear.fit(X_train_split, y_train_split)
y_pred_linear_xgb = y_pred_linear + xgboost_model_linear.predict(X_testing)
rmse_linear_xgb = _rmse(y_testing, y_pred_linear_xgb)
mape_linear_xgb = _mape(y_testing, y_pred_linear_xgb)

# ---------------------------------------------------------------------------
# Model 3: linear regression + ARMA(2, 2) forecast of its residuals
# ---------------------------------------------------------------------------
sarimax_model_residuals = SARIMAX(residuals_linear, order=(2, 0, 2))
sarimax_residuals_results = sarimax_model_residuals.fit(disp=False)
# Forecast by step count rather than by date label: this works even when the
# date index carries no frequency information. It assumes the test rows
# directly follow the training rows at the same spacing.
_residual_forecast = np.asarray(sarimax_residuals_results.forecast(steps=_n_test))
y_pred_linear_sarimax = pd.Series(
    y_pred_linear + _residual_forecast, index=y_testing.index
)
rmse_linear_sarimax = _rmse(y_testing, y_pred_linear_sarimax)
mape_linear_sarimax = _mape(y_testing, y_pred_linear_sarimax)

# ---------------------------------------------------------------------------
# Model 4: seasonal ARIMAX on the price with the exogenous regressors
# ---------------------------------------------------------------------------
arimax_model = SARIMAX(
    y_training, exog=X_training, order=(0, 1, 1), seasonal_order=(1, 0, 0, 12)
)
arimax_results = arimax_model.fit(disp=False)
# Out-of-sample forecast over the test horizon, driven by the test regressors.
y_p_arimax = pd.Series(
    np.asarray(arimax_results.forecast(steps=_n_test, exog=X_testing)),
    index=y_testing.index,
)
rmse_arimax = _rmse(y_testing, y_p_arimax)
mape_arimax = _mape(y_testing, y_p_arimax)

# ---------------------------------------------------------------------------
# Disabled exploratory snippets
# ---------------------------------------------------------------------------
# The two bare string literals below hold code that is NOT executed: an
# automatic (S)ARIMA order search with pmdarima, and an ADF stationarity test
# on the training target. They are kept verbatim for reference.
# NOTE(review): recent pmdarima releases appear to take exogenous regressors
# via `X` rather than `exogenous` - confirm against the installed version
# before re-enabling the first snippet.
"""
from pmdarima import auto_arima
def fit_auto_arimax(series, exog=None, seasonal=True, m=12):
    auto_model = auto_arima(
        series,
        exogenous=exog,
        seasonal=seasonal,
        m=m,
        trace=True,
        error_action="ignore", 
        suppress_warnings=True,
        stepwise=True,
        n_jobs=-1
    )
    return auto_model

# Ajustar modelos auto ARIMAX para cada variable
modeli = fit_auto_arimax(y_training, seasonal=True, m=12)
"""
# Second disabled snippet: order search on the target and on the linear
# residuals, followed by an Augmented Dickey-Fuller test.
"""
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
auto_arima(y_training).summary()
auto_arima(residuals_linear).summary()
adf_test = adfuller(y_training.dropna())
print(f"ADF Test Statistic: {adf_test[0]}, P-value: {adf_test[1]}")
"""
# Draw the observed price from 2024 onwards and overlay each model's
# prediction for the test period on a single figure.
plt.figure(figsize=(14, 6))

_observed_prices = hvo100_prices[hvo100_prices.index.year >= 2024]
plt.plot(_observed_prices, label="Real Diesel Price [€/lt]", color='blue')

# (predicted values, legend label, line colour); plotted in this order so the
# legend entries keep a fixed sequence.
_prediction_curves = [
    (y_pred_linear, "Linear", 'orange'),
    (y_p_arimax, "ARIMAX", 'red'),
    (y_pred_linear_xgb, "Linear + XGBoost", 'purple'),
    (y_pred_linear_sarimax, "Linear + SARIMAX", 'pink'),
]
for _values, _label, _colour in _prediction_curves:
    plt.plot(test_data.index, _values, label=_label, color=_colour, linestyle='--')

plt.xlabel("Date")
plt.ylabel("HVO100 Price [€/lt]")
plt.title("Prediction Comparison")
plt.legend()
plt.show()

# Collect each model's test-set error metrics as (name, MAPE, RMSE) rows,
# then pivot them into the column-oriented mapping used to build the table.
_metric_rows = [
    ('ARIMAX', mape_arimax, rmse_arimax),
    ('Linear', mape_linear, rmse_linear),
    ('Linear + XGBoost', mape_linear_xgb, rmse_linear_xgb),
    ('Linear + SARIMAX', mape_linear_sarimax, rmse_linear_sarimax),
]
_model_names, _mape_values, _rmse_values = (list(column) for column in zip(*_metric_rows))

error_analysing = {
    'Model': _model_names,
    'MAPE [%]': _mape_values,
    'RMSE': _rmse_values,
}
errores = pd.DataFrame(error_analysing)
print(errores)
